Word Frequency Distribution¶

Diachronic word frequencies (time slices aligned with Barbara's code).

In [216]:
#imports
import nltk
from nltk.text import Text, TextCollection
import pandas as pd
from utils.data.readCorpus import NltkCorpusFromDir, NltkCorpusFromList
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
import os, sys, itertools, re
###
from nltk.corpus.reader.plaintext import PlaintextCorpusReader, CategorizedPlaintextCorpusReader
from nltk.tokenize.simple import SpaceTokenizer, LineTokenizer
In [214]:
# we'll first define plotting function to facilitate multiplot generation
def build_multiplot(cols, subplot_type, n_items, subplot_titles, **kwargs):
    rows = divmod(n_items, cols)[0] + 1 if divmod(n_items, cols)[1] > 0 else divmod(n_items, cols)[0]
    rows_cols = [ (col, row) for col, row in itertools.product(range(1,rows+1), range(1,cols+1)) ]
    specs =  [ [ {"type": subplot_type} for col in range(1, cols+1) ] for i in range(1, rows+1) ] if subplot_type is not None else None
    
    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=subplot_titles if subplot_titles is not None else None,
        specs = specs,
        **kwargs
    )
    
    return fig, rows_cols

The corpus¶

The corpus processing phase follows as close as possible BMG's workflow to keep models compatible. There are 2 exceptions:

  • all lemmas are converted to lowercase and
  • anomalous lemmas (mostly punctuation) are added to stopword list.
In [5]:
# corpus files
dir_input =  os.path.join("/home/krzys/Kod/lvlt22/BMG/LatinISE_1/")
dir_in = os.path.join(dir_input, "preprocessed_lemmas")
dir_in_words = os.path.join(dir_input, "preprocessed_tokens")
files = os.listdir(os.path.join(dir_in))
files = [f for f in files[:] if "IT" in f]

Process the metadata¶

We'll be storing corpus metadata in a data frame.

In [72]:
# metadata (BMG)
metadata_df = pd.read_csv(os.path.join(dir_input, 'latinise_metadata.csv'), sep = ",")
metadata_df = metadata_df[metadata_df['id'].str.startswith("IT")]
metadata_df.head()
metadata_df["date"] = metadata_df["date"].astype('int') #ensure we're working with integers
In [73]:
first_date = min(metadata_df.date) # BMG
last_date = max(metadata_df.date)
print(first_date)
print(last_date)
-450
2005
In [74]:
last_date = 900 # BMG

Define size of the time intervals:

In [75]:
size_interval = 450 # BMG

So there are

In [76]:
n_intervals = round((last_date-first_date)/size_interval) # BMG
n_intervals
Out[76]:
3

time intervals.

Define the time periods and split the corpus:

In [77]:
intervals = [None]*(n_intervals+1) # BMG
for t in range(n_intervals+1):
    #print(t)
    if t == 0:
        intervals[t] = first_date
    else:
        intervals[t] = intervals[t-1]+size_interval
    #print(intervals[t])
    
print(intervals)
[-450, 0, 450, 900]

Add a column to the metadata_df for the time interval:

In [78]:
metadata_df['time_interval'] = ""
for t in range(len(intervals)-1):
    print(t)
    print(range(intervals[t],intervals[t+1]))
    metadata_df_t = metadata_df.loc[metadata_df['date'].isin(range(intervals[t],intervals[t+1]))]
    print(metadata_df_t.date)
    metadata_df.loc[metadata_df['date'].isin(range(intervals[t],intervals[t+1])),'time_interval'] = intervals[t]
metadata_df
0
range(-450, 0)
19      -9
34     -49
39     -45
42     -49
57     -80
      ... 
635   -149
638   -107
642    -37
643    -37
649   -229
Name: date, Length: 77, dtype: int64
1
range(0, 450)
18     382
23     399
24     391
37     158
38      49
      ... 
682    382
683    116
684    116
685    116
686    116
Name: date, Length: 235, dtype: int64
2
range(450, 900)
20      524
102     800
104     800
105     800
106     800
       ... 
609     598
634     550
636     550
645     450
1265    533
Name: date, Length: 73, dtype: int64
Out[78]:
id title creator date type file time_interval
18 IT-LAT0001 Vulgata Hieronymus 382 poetry lat_0382_IT-LAT0001.txt 0
19 IT-LAT0537 Ars amatoria Ovidius Naso, Publius -9 poetry lat_-009_IT-LAT0537.txt -450
20 IT-LAT0011 S. Benedicti Regula Benedictus Nursianus 524 prose lat_0524_IT-LAT0011.txt 450
21 IT-LAT0012 In psalmis Davidis expositio Thomas Aquinas: Sanctus 1254 prose lat_1254_IT-LAT0012.txt
22 IT-LAT0014 Adoro te devote Thomas Aquinas: Sanctus 1254 poetry lat_1254_IT-LAT0014.txt
... ... ... ... ... ... ... ...
683 IT-LAT0534_1 De origine et situ Germanorum Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_1.txt 0
684 IT-LAT0534_2 De vita Iulii Agricolae Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_2.txt 0
685 IT-LAT0534_3 Dialogus de oratoribus Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_3.txt 0
686 IT-LAT0534_4 Historiae Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_4.txt 0
1265 IT-LAT0202 Institutiones Iustinianus, Caesar Flavius (Imperator Iustini... 533 prose lat_0533_IT-LAT0202.txt 450

670 rows × 7 columns

Read in corpus files¶

In [80]:
# define corpus subset
corpus_subset = metadata_df[metadata_df['date'] <= last_date].copy().reset_index(drop=True)
filenames_subset = corpus_subset['file'] # filenames were defined above to get IT files only
In [132]:
# prepare the corpus
punctuation = ['.', ',', '...', ';', ':', '?', '(', ')', '-', '!', '[', ']', '"', "'", '""', '\n']
In [133]:
class NltkCorpusFromDirNew(PlaintextCorpusReader):
    "A subclass of NLTK PlaintextCorpusReader"
    
    word_tokenizer=SpaceTokenizer() # tokenize on whitespace
    sent_tokenizer=LineTokenizer() # assume sentence per line
    
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",        
        word_tokenizer=word_tokenizer,
        sent_tokenizer=sent_tokenizer,
        tolower=False, punctuation=None
    ):

        PlaintextCorpusReader.__init__(self, root=root, fileids=fileids, encoding=encoding,
                                       word_tokenizer=word_tokenizer,
                                       sent_tokenizer=sent_tokenizer)
        self.tolower = tolower
        self.punctuation = punctuation
        
    def _read_word_block(self, stream):
        words = []
        for i in range(20):  # Read 20 lines at a time.
            if self.punctuation is not None:
                words.extend( [ token.lower() if self.tolower == True else token for token in self._word_tokenizer.tokenize(stream.readline()) if token not in self.punctuation  ])
            else:
                words.extend( [ token.lower() if self.tolower == True else token for token in self._word_tokenizer.tokenize(stream.readline()) ])
        return words
In [134]:
#prepare the corpus
latinise = NltkCorpusFromDirNew(root=dir_in, fileids=filenames_subset,
                                punctuation=punctuation, tolower=True)
latinise_docs = []
for fileid in latinise.fileids():
    latinise_docs.append(Text(latinise.words(fileid)))
print("This corpus contains ", len(latinise_docs), " documents.")
This corpus contains  385  documents.
In [135]:
# read in word corpus for token stats
latinise_words = NltkCorpusFromDirNew(root=dir_in_words, fileids=filenames_subset,
                                punctuation=punctuation, tolower=True)
latinise_words_docs = []
for fileid in latinise_words.fileids():
    latinise_words_docs.append(Text(latinise_words.words(fileid)))
print("This corpus contains ", len(latinise_words_docs), " documents.")
This corpus contains  385  documents.
In [136]:
metadata_df_subset = corpus_subset
metadata_df_subset.head()
Out[136]:
id title creator date type file time_interval period tokens lemmas_unique tokens_unique century
0 IT-LAT0001 Vulgata Hieronymus 382 poetry lat_0382_IT-LAT0001.txt 0 0 - 450 26760 3124 5713 4
1 IT-LAT0537 Ars amatoria Ovidius Naso, Publius -9 poetry lat_-009_IT-LAT0537.txt -450 -450 - 0 15945 3436 6083 -1
2 IT-LAT0011 S. Benedicti Regula Benedictus Nursianus 524 prose lat_0524_IT-LAT0011.txt 450 450 - 900 12156 2045 3744 6
3 IT-LAT0015 Confessiones Augustinus, Aurelius 399 prose lat_0399_IT-LAT0015.txt 0 0 - 450 5434 1298 2096 4
4 IT-LAT0016 Regula Augustinus, Aurelius 391 prose lat_0391_IT-LAT0016.txt 0 0 - 450 2249 691 1077 4
In [137]:
labels = [ ' - '.join([str(bin1), str(bin2)]) for bin1, bin2 in zip(intervals[0:len(intervals)-1],intervals[1:len(intervals)])]
metadata_df_subset["period"] = pd.cut(metadata_df_subset["date"], intervals, labels=labels, include_lowest=True)
metadata_df_subset["tokens"] = [  len(latinise.words(filename)) for filename in metadata_df_subset['file'] ]
metadata_df_subset["lemmas_unique"] = [  len(set(latinise.words(filename))) for filename in metadata_df_subset['file'] ]
metadata_df_subset["tokens_unique"] = [  len(set(latinise_words.words(filename))) for filename in metadata_df_subset['file'] ]
century = [-500,-400,-300,-200,-100,0,100,200,300,400,500,600,700,800,900,1000]
metadata_df_subset["century"] = pd.cut(metadata_df_subset["date"], century, labels=[y for y in range(-5,0)]+[y for y in range(1,11)], include_lowest=True)
In [138]:
# detect and remove empty files
metadata_df_subset = metadata_df_subset[ ~ metadata_df_subset["tokens"] < 1 ]
metadata_df_subset.head()
Out[138]:
id title creator date type file time_interval period tokens lemmas_unique tokens_unique century
0 IT-LAT0001 Vulgata Hieronymus 382 poetry lat_0382_IT-LAT0001.txt 0 0 - 450 25229 3123 5712 4
1 IT-LAT0537 Ars amatoria Ovidius Naso, Publius -9 poetry lat_-009_IT-LAT0537.txt -450 -450 - 0 14917 3435 6082 -1
2 IT-LAT0011 S. Benedicti Regula Benedictus Nursianus 524 prose lat_0524_IT-LAT0011.txt 450 450 - 900 11675 2044 3743 6
3 IT-LAT0015 Confessiones Augustinus, Aurelius 399 prose lat_0399_IT-LAT0015.txt 0 0 - 450 5172 1297 2095 4
4 IT-LAT0016 Regula Augustinus, Aurelius 391 prose lat_0391_IT-LAT0016.txt 0 0 - 450 2144 690 1076 4

Corpus stats¶

We'll start by mapping time intervals to colors.

In [36]:
period_labels = labels + ["all"]
periods2colors = { period : px.colors.qualitative.Alphabet[i] for i, period in enumerate(period_labels)} # for each period we fix a color
fig = go.Figure()
fig.add_trace(go.Bar(
    x = [col for col in periods2colors.keys()],
    y = [0.5 for x in range(0, len(periods2colors)) ],
    text = period_labels
,
    textangle=90,
    marker_color=[col for col in periods2colors.values()]
))
fig.update_layout(showlegend=False, xaxis={'showgrid': False, 'visible': False}, yaxis={'showgrid': False, 'visible': False})

We're now aggregating some corpus stats from the metadata and NLTK corpora.

We're defining NLTK conditional frequency distribution objects for a quick access to counts.

In [203]:
latinise_period = nltk.ConditionalFreqDist() #lemmas
for period in metadata_df_subset["period"].unique():
    condition = period
    for word in latinise.words( [ fname for fname in metadata_df_subset[metadata_df_subset["period"] == period]['file'] ] ):
        latinise_period[condition][word] +=1
        
latinise_all = nltk.FreqDist(latinise.words())
In [163]:
latinise_words_period = nltk.ConditionalFreqDist() #words
for period in metadata_df_subset["period"].unique():
    condition = period
    for word in latinise_words.words([ fname for fname in metadata_df_subset[metadata_df_subset["period"] == period]['file'] ]):
        latinise_words_period[condition][word] +=1
In [164]:
latinise_freqs = nltk.FreqDist(latinise.words())
latinise_words_freqs = nltk.FreqDist(latinise_words.words())
In [273]:
# most common words by period
topn = 20
most_common = []
for period in latinise_period.conditions():
    top = [ (period, i+1, tup[0], tup[1]) for i, tup in enumerate(latinise_period[period].most_common(topn)) ] # append period and explicit rank to the (word, freq) tuple
    most_common.extend(top)
  
# add total counts
for i, tup in enumerate(latinise_all.most_common(topn)):
    most_common.append(('all', i+1, tup[0], tup[1]))

most_common_df = pd.DataFrame.from_records(most_common, columns=["period", "rank", "word", "freq"])
most_common_df.head()
fig, rows_cols = build_multiplot(2, 'table', len(most_common_df["period"].unique()), [ period for period in most_common_df["period"].unique() ],
                                 shared_yaxes=True, shared_xaxes=False, vertical_spacing=0.04)
# most common lemmas in corpus
top_display=10
for i,group in enumerate(most_common_df.groupby("period")):
    df = group[1][['rank', 'word', 'freq']].iloc[:top_display]
    tbl = go.Table(
        header=dict(
            values=['rank', 'word', 'freq'],
            line_color=periods2colors[group[0]],
            fill_color=periods2colors[group[0]],
            align='center',
            font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df[col] for col in df],
        ))
    fig.add_trace(tbl, row=rows_cols[i][0], col=rows_cols[i][1])

fig.update_layout(height=700, width=700, showlegend=False, title='Top '+str(top_display)+' most common words in the corpus')
fig.show()
In [470]:
corpus_stats = []
for period in latinise_period.conditions():
    corpus_stats.append((
        period,
        latinise_period[period].N(), #number of lemmas
        latinise_period[period].B(), #number of unique lemmas
        latinise_words_period[period].N(), #number of words
        latinise_words_period[period].B(), #number of unique words
    )
    )

# add total counts
corpus_stats.append(
    (
        'all',
        latinise_freqs.N(),
        latinise_freqs.B(),
        latinise_words_freqs.N(),
        latinise_words_freqs.B()
    )
)
#latinise_words_period['0 - 450'].B()
corpus_stats_df = pd.DataFrame.from_records(corpus_stats, columns=['period', 'lemmas', 'lemmas_u', 'words', 'words_u'])
corpus_stats_df["period"] = corpus_stats_df["period"].astype("category") # cast to categorical to force custom sorting
corpus_stats_df['period'] = corpus_stats_df['period'].cat.reorder_categories(period_labels, ordered=True)
corpus_stats_df.sort_values(by='period',inplace=True)
corpus_stats_df.reset_index(inplace=True, drop=True)
corpus_stats_df["ttr"] = (corpus_stats_df["words_u"] / corpus_stats_df["words"]) * 100
corpus_stats_df.head()
Out[470]:
period lemmas lemmas_u words words_u ttr
0 -450 - 0 1395938 44861 1395858 103432 7.409923
1 0 - 450 2800542 97396 2799762 195764 6.992166
2 450 - 900 1105178 50265 1105116 97905 8.859251
3 all 5301658 142051 5300736 262883 4.959368
In [549]:
fig = px.bar(corpus_stats_df[corpus_stats_df["period"] == 'all'],
             x="period",
             y=["lemmas_u", "words_u", "words"],
             barmode="group",
             orientation='v',
            )
labels=dict(words="words", words_u="unique words", lemmas_u="unique lemmas")

fig.for_each_trace(lambda trace: trace.update(name = labels[trace.name]))

fig.update_layout(
    title="Corpus size",
    xaxis_title="period",
    yaxis_title="count",
    legend_title="",    
)

fig.show()
In [466]:
fig = px.bar(corpus_stats_df,
             y="period",
             x=["lemmas_u", "words_u", "words"],
             barmode="group",
             orientation='h',
            )
labels=dict(words="words", words_u="unique words", lemmas_u="unique lemmas")

fig.for_each_trace(lambda trace: trace.update(name = labels[trace.name]))

fig.update_layout(
    title="Number of tokens",
    xaxis_title="period",
    yaxis_title="count",
    legend_title="",    
)

fig.show()
In [434]:
fig = px.bar(corpus_stats_df,
             x="period",
             y=["lemmas_u", "words_u",],
             title="Number of unique lemmas and words",
             barmode="group"
            )
fig.show()
In [ ]:
# number of tokens per period
df = metadata_df_subset.groupby("period").agg({"tokens":"sum"}).reset_index()
fig = px.bar(df, x="period", y="tokens", color="period", color_discrete_map=periods2colors)
fig.update_layout(title="Number of tokens in corpora",barmode='stack')
fig.show()
In [548]:
fig = px.histogram(metadata_df_subset,
                   x="period", y="file",
                   histfunc='count',
             #barmode="group",
                   category_orders={"period":period_labels[:len(period_labels)-1]},
             color="period", color_discrete_map=periods2colors)
fig.update_layout(title="Number of texts by period",barmode='stack', showlegend=False)
fig.show()
In [553]:
df = metadata_df_subset.groupby(["period", "creator"]).size().reset_index()
df.columns = ["period", "creator", "count"]
df = df[df["count"] > 0]
fig = px.histogram(df,
                   x="period", y=["count"],
                   histfunc='count',
             #barmode="group",
                   category_orders={"period":period_labels[:len(period_labels)-1]},
             color="period", color_discrete_map=periods2colors)
fig.update_layout(title="Number of authors by period",barmode='stack', showlegend=False)
fig.show()
In [482]:
fig = px.histogram(metadata_df_subset,
             x="period", y="creator",
                   histfunc='count',
                   category_orders={"period":period_labels[:len(period_labels)-1]},
             color="period", color_discrete_map=periods2colors)
fig.update_layout(title="Number of authors by period",barmode='stack', showlegend=False)
fig.show()
In [554]:
fig = px.histogram(metadata_df_subset,
             x="period", y="creator",
                   histfunc='count',
                   category_orders={"period":period_labels[:len(period_labels)-1]},
             color="period", color_discrete_map=periods2colors)
fig.update_layout(title="Number of authors by period",barmode='stack', showlegend=False)
fig.show()

TODO

  • keywords (period vs. reference)
  • rank/freq (period vs. reference)
  • add from Voces (Kilgarriff, A. ‘Comparing Corpora’. International Journal of Corpus Linguistics 6, no. 1 (1 November 2001): 97–133. https://doi.org/10.1075/ijcl.6.1.05kil.)

Terms¶

We are interested in semantic change patterns of a set of terms related to the socio-political life, such as:

In [6]:
# define terms we're interested in
socio_political_terms = ["civitas", "consilium", "consul", "dux", "gens", "hostis", "imperator",
                         "jus", "labor", "natio", "nobilitas", "pontifex", "pontificium", "populus", "potestas", "regnum", "senatus", "sodes", "urbs"]
print(socio_political_terms)
['civitas', 'consilium', 'consul', 'dux', 'gens', 'hostis', 'imperator', 'jus', 'labor', 'natio', 'nobilitas', 'pontifex', 'pontificium', 'populus', 'potestas', 'regnum', 'senatus', 'sodes', 'urbs']

We're assigning each term a seperate colour to facilitate our analyses.

In [7]:
color_discrete_map_terms = { term : px.colors.qualitative.Alphabet[i] for i, term in enumerate(socio_political_terms)} # for each term we fix a color
fig = go.Figure()
fig.add_trace(go.Bar(
    x = [col for col in color_discrete_map_terms.keys()],
    y = [0.5 for x in range(0, len(color_discrete_map_terms)) ],
    text = socio_political_terms,
    textangle=90,
    marker_color=[col for col in color_discrete_map_terms.values()]
))
fig.update_layout(showlegend=False, xaxis={'showgrid': False, 'visible': False}, yaxis={'showgrid': False, 'visible': False})

Word frequency¶

In [194]:
# Terms
terms = ['gens', 'natio', 'civitas', 'populus', 'urbs']
# Fix colors for plotting purposes
color_discrete_map_terms = { term : px.colors.qualitative.Plotly[i] for i, term in enumerate(terms)}
In [195]:
# year by year frequency
dates = metadata_df_subset["date"].unique()
Out[195]:
Index(['lat_0091_IT-LAT0332.txt'], dtype='object', name='file')
In [196]:
cfd_year = nltk.ConditionalFreqDist()
for date in dates:
    condition = date
    for word in latinise.words(metadata_df_subset[metadata_df_subset["date"] == date].index):
        cfd_year[condition][word] +=1
In [197]:
#count tokens by year
freq_by_year = pd.DataFrame()
freq_by_year["year"] = pd.to_numeric(cfd_year.conditions())
freq_by_year["count"] = [ sum(cfd_year[year].values()) for year in freq_by_year["year"] ]
freq_by_year["period"] = pd.cut(freq_by_year["year"], bins=intervals,labels=labels,include_lowest=True)
In [198]:
# count term frequency by year
terms_by_year = pd.DataFrame([ (year, term, counts[term]) for year, counts in cfd_year.items() for term in terms ],
                             columns = ["year", "term", "count"])
terms_by_year["year"] = pd.to_numeric(terms_by_year["year"])
#terms_by_year["term"].astype("category")
terms_by_year["count"] = pd.to_numeric(terms_by_year["count"])
terms_by_year["period"] = pd.cut(terms_by_year["year"],bins=intervals,labels=labels,include_lowest=True)
terms_by_year.head()
Out[198]:
year term count period
0 382 gens 1 0 - 450
1 382 natio 0 0 - 450
2 382 civitas 21 0 - 450
3 382 populus 5 0 - 450
4 382 urbs 0 0 - 450
In [199]:
fig = px.scatter(terms_by_year,x="year", y="count", color="term")
fig.update_layout(title="Frequency of the terms by year")
fig.show()
In [200]:
# by period (ppm)
import numpy as np
freq_by_period = freq_by_year.groupby(["period"], as_index=False).agg({'count' : np.sum}) #all tokens by period
terms_by_period = terms_by_year.groupby(["term","period"], as_index=False).agg({'count' : np.sum})
counts_by_period = pd.merge(terms_by_period, freq_by_period, on=["period"], how="right")["count_y"]
terms_by_period["ppm"] = ( terms_by_period["count"] / counts_by_period ) * 1000000
terms_by_period.head()
Out[200]:
term period count ppm
0 civitas -450 - 0 1658 1135.150716
1 civitas 0 - 450 1531 1048.200088
2 civitas 450 - 900 1282 877.722085
3 gens -450 - 0 760 520.334466
4 gens 0 - 450 1453 994.797340
In [201]:
fig = px.line(terms_by_period,x="period", y="ppm", color="term", color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by period (ppm)")
fig.show()
In [202]:
# by period (raw counts)
fig = px.bar(terms_by_period,x="period", y="count", color="term", color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by period",barmode='stack')
fig.show()
In [203]:
# frequency by century
century = [-500,-400,-300,-200,-100,0,100,200,300,400,500,600,700,800,900,1000]
terms_by_year["century"] = pd.cut(terms_by_year["year"], bins=century, 
                                  labels=[y for y in range(-5,0)]+[y for y in range(1,11)])
terms_by_year.head()
Out[203]:
year term count period century
0 382 gens 1 0 - 450 4
1 382 natio 0 0 - 450 4
2 382 civitas 21 0 - 450 4
3 382 populus 5 0 - 450 4
4 382 urbs 0 0 - 450 4
In [204]:
terms_by_century = terms_by_year.groupby(["century", "term"]).agg({'count':np.sum}).reset_index()
In [205]:
fig = px.line(terms_by_century,x="century", y="count", color="term", color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by century")
fig.show()
In [210]:
#by fileid - prepare freq distribution
import itertools
cfd_bytext = nltk.ConditionalFreqDist()
for file in [ x for x in latinise.fileids() ]:
    for word in latinise.words(file):
        cfd_bytext[file][word] +=1

terms_by_text = pd.DataFrame([ (filename, term, counts[term]) for filename, counts in cfd_bytext.items() for term in terms ],
                             columns = ["file", "term", "count"])
terms_by_text = pd.merge(terms_by_text, metadata_df_subset, on="file")
terms_by_text['ppm'] = (terms_by_text['count'] / terms_by_text['no_tokens']) * 1000000
terms_by_text.head()

# top works by period - raw
top_terms_by_text = terms_by_text.sort_values('count', ascending=False).groupby(["period","term"]).head(5).sort_values("count", ascending=False)
In [211]:
fig = px.bar(top_terms_by_text,x="id", y="count", color="term", facet_col="period", facet_row="term", text="title", facet_col_wrap=2, category_orders={"period":labels}, 
             height=800, hover_data=["id", "creator", "title", "period"], color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Texts with max number of terms by period", uniformtext_minsize=10, uniformtext_mode='hide')
fig.update_xaxes(matches=None, visible=False)
fig.update_yaxes(matches=None)
fig.show()
In [212]:
# top works by period - ppm
top_terms_by_text = terms_by_text.sort_values('ppm', ascending=False).groupby(["period","term"]).head(5).sort_values("ppm", ascending=False)
fig = px.bar(top_terms_by_text,x="id", y="ppm", color="term", facet_col="period", facet_row="term", text="title", facet_col_wrap=2, category_orders={"period":labels}, 
             height=1200, hover_data=["id", "creator", "title", "period"], color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Texts with max number of terms by period (ppm)",
                  uniformtext_minsize=14)
fig.update_xaxes(matches=None, visible=False)
fig.update_traces(textposition='inside', textfont_size=16)
fig.show()
In [213]:
#top_terms_by_author
top_terms_by_author = terms_by_text.sort_values('count', ascending=False).groupby(["creator","term","period"], observed=True).agg({'count':np.sum}).reset_index().sort_values("count", ascending=False)
fig = px.bar(top_terms_by_author, x="creator", y="count", color="term", facet_col="period", facet_row="term", text="creator", facet_col_wrap=2, category_orders={"period":labels}, 
             height=800, hover_data=["creator", "period"], color_discrete_map=color_discrete_map_terms)

fig.update_layout(title="Authors with max number of terms by period", uniformtext_minsize=8)
fig.update_xaxes(matches=None, visible=False)
fig.update_traces(textposition='outside', textfont_size=14)
fig.update_yaxes(matches=None)
fig.show()
In [214]:
# terms by genre
top_terms_by_text.head()
top_terms_by_genre = terms_by_text.sort_values('count', ascending=False).groupby(["term", "type","period"]).head(5).sort_values("count", ascending=False)
top_terms_by_genre.head()
fig = px.bar(top_terms_by_genre,x="period", y="count", color="term", facet_col="period", facet_row="type", facet_col_wrap=2, category_orders={"period":labels}, 
             height=800, hover_data=["id", "creator", "title", "period"], color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Texts with max number of terms by period (raw)", uniformtext_minsize=8)
fig.update_xaxes(matches=None, visible=False)
fig.update_yaxes(matches=None)
fig.show()